Main goal: to figure out what accounts the participants are following
Problem: Reverse chronological endpoint is capped (limited 10k tweets/month)
To get a sense of this, we are analyzing Rockwell pilot data and trying to estimate…
Strategy:
X: number of tweets collected
Y: fraction of distinct accounts appearing in these tweets pulled
⟹ See where the plot becomes flat (find the elbow point/saturating point)
Two candidate y-axes:
(1) # of distinct accounts appearing in tweets / total # of distinct friends that each user has.
(2) # of distinct accounts appearing in tweets / total # of distinct accounts that each user sees.
The x-axis is the # of tweets collected in all plots. However, the problem is
that this x-axis is not comparable across users since they have different
numbers of friends - e.g. some users follow lots of friends and thus naturally
get more tweets, while others follow only a few accounts and thereby get few
tweets. Since the y-axis is in fraction/relative terms, a rescaled x-axis is
also added: (# of tweets collected) divided by the average tweets per second
of each user.
# Load packages
library(readr)
library(tidyverse)
library(ggplot2)
library(ggthemes)
library(grid)
library(gridExtra)
library(DT)
library(lubridate)
library(scales)
library(dplyr)
library(magrittr)
#**** Prepare data ****#
# Load data
# The data is cleaned and exported using Python.
# Python code for making dataframe for R is provided upon request.
df <- read_csv("df.csv")
backup <- df
# Define data types: the three id columns become factors, and
# `account_id` is carried forward under the name `friend_id`.
df %>%
  mutate(
    across(c(user_id, tweet_id), as.factor),
    friend_id = as.factor(account_id)
  ) %>%
  dplyr::select(-account_id) -> df
# Making new variables
# (1) Maximum number of each user's friends: max_friends_count
# For each user, take the maximum user_friends_count observed in the data.
# (The original select/distinct/group_by/mutate/select/distinct chain is a
# roundabout summarise(max(...)); this produces the identical one-row-per-user
# table directly.)
df %>%
  group_by(user_id) %>%
  summarise(max_friends_count = max(user_friends_count)) -> max_data
# max_data: a new dataframe with `user_id` and `max_friends_count` as variables
# Merge this 'max_data' into df (note: merge() sorts the result by user_id)
df %>%
  merge(max_data, by="user_id") -> df
# (2) Timestamp data of each tweet: tweet_timestamp
# Clean the timestamp strings so the lubridate package can parse them.
# The raw value is space-separated; the original code took components
# 2 (month), 3 (day), 4 (time) and 6 (year) — presumably the standard
# Twitter created_at format "Wed Oct 10 20:19:24 +0000 2018"; TODO confirm
# against the exported data.
# Vectorized: str_split(..., simplify = TRUE) returns a character matrix,
# which avoids growing four vectors inside a 1:length() loop.
ts_parts <- str_split(df$tweet_timestamp, " ", simplify = TRUE)
# Reassemble as "day month year time" for dmy_hms()
dmyt <- paste(ts_parts[, 3], ts_parts[, 2], ts_parts[, 6], ts_parts[, 4])
# Overwrite the variable with a proper date-time
df$tweet_timestamp = dmy_hms(dmyt) # timestamp format: day-month-year-hour-minute-second
# (3) Define x-axis: number of tweets collected
# BUG FIX: the original piped arrange(tweet_timestamp) into count(tweet_id).
# count()/summarise() sort their output by the grouping keys (here user_id and
# the tweet_id factor levels), which silently discards the chronological order,
# so cumsum(n) was accumulated in tweet_id-level order rather than in time
# order. Here we summarise per tweet, keep each tweet's first timestamp, and
# order by it before taking the cumulative sum.
df %>%
  group_by(user_id, tweet_id) %>%
  summarise(
    n = n(),                            # rows per tweet (one per friend shown)
    first_time = min(tweet_timestamp),  # when this tweet was collected
    .groups = "drop"
  ) %>%
  arrange(user_id, first_time) %>%      # chronological within each user
  group_by(user_id) %>%
  mutate(
    old_x = cumsum(n),                  # old_x: number of tweets collected so far
    max_n_tweets = max(old_x)           # total tweets collected for this user
  ) %>%
  ungroup() %>%
  dplyr::select(
    user_id, tweet_id, old_x, max_n_tweets
  ) -> df_for_x
df %>%
  inner_join(df_for_x, by=c("user_id", "tweet_id")) %>%
  arrange(user_id, tweet_timestamp) -> df
#* [X] Re-scaling → divide x axis by the average tweets per second of each participant.
#* For each participant, (1) take the first and last tweet in the data and compute the number of seconds between them, and then (2) divide the total number of tweets seen for the participant by the number of seconds.
# BUG FIX: the original did as.numeric(max(t) - min(t)). A bare difftime picks
# its unit automatically (secs, mins, hours or days depending on magnitude), so
# as.numeric() on it is NOT guaranteed to be seconds and the unit could even
# differ across users. difftime(..., units = "secs") pins it down.
df |>
  group_by(user_id) |>
  summarise(
    timediff = as.numeric(difftime(max(tweet_timestamp), min(tweet_timestamp),
                                   units = "secs"))
  ) -> timeDiff
df |>
  merge(timeDiff, by="user_id") |>
  mutate(
    # average collection rate for this user (tweets per second)
    avg_n_tweets_persec = max_n_tweets / timediff,
    # new_x: number of tweets collected so far divided by the average number of tweets per second
    new_x = old_x / avg_n_tweets_persec
  ) -> df2
# (4) Define y-axis: count how many distinct accounts are in the tweets (numerator)
#* [Y] Re-scaling → make a fraction for y-axis (new denominator as Brendan suggested : maximum of the total distinct accounts you "see" (not you have) as a new denominator to make all 60 individuals reach 1 in the end (individual plots))
# Within each user (in chronological order), `!duplicated(friend_id)` marks the
# first appearance of each account, so the cumulative sum is a running count of
# distinct accounts seen so far. dplyr evaluates duplicated() per group here,
# so the count restarts for every user — the arrange() before group_by() is
# what makes "so far" mean "in time order".
# NOTE(review): old_y can exceed 1 if a user's timeline contains accounts
# beyond their followed friends (e.g. retweeted accounts) — verify against the
# exported data.
df2 %>%
arrange(user_id, tweet_timestamp) %>%
group_by(user_id) %>%
mutate(
numerator = cumsum(!duplicated(friend_id)),
old_y = numerator / max_friends_count,
new_y = numerator / max(numerator)
) -> df2
# df2 is the final data for drawing plots
| Variable | Definition |
|---|---|
| old_x | Number of tweets collected so far |
| new_x | Number of tweets collected so far / Average number of tweets per second |
| old_y | Fraction of distinct accounts that appeared in tweets over the maximum number of friends each user has (= how many distinct accounts, among all the friends each user has, have appeared in the tweets pulled so far?) |
| new_y | Fraction of distinct accounts that appeared in tweets over the maximum number of distinct accounts each user sees in the tweets pulled so far (thus everyone reaches 1 at the end) |
Plots 1a-1d are scatter plots displaying patterns of change in the fraction of distinct accounts as we pull tweets.
# Plots 1a-1d: per-user scatter of fraction of distinct accounts vs tweets
# collected, one panel per combination of {old_x, new_x} x {old_y, new_y}.
# FIX: the original piped each pipeline through group_by(user_id) first;
# grouping has no effect on ggplot() (colour = user_id already separates the
# users), so the no-op group_by() calls are removed.
df2 %>%
  ggplot(aes(x=old_x, y=old_y, col=user_id)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  xlab("Number of Tweets Collected") +
  ylab("Fraction of Distinct Accounts (%)") +
  scale_x_continuous(n.breaks = 10, limits=c(0, 120000),
                     labels = label_number(scale_cut = cut_short_scale())) +
  ggtitle("Plot 1a", subtitle = "old x (# of tweets), \nold y (# distinct accounts/max friends count)") -> plot1a
df2 %>%
  ggplot(aes(x=old_x, y=new_y, col=user_id)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  xlab("# of Tweets Collected") +
  ylab("Fraction of Distinct Accounts (%)") +
  scale_x_continuous(n.breaks = 10, limits=c(0, 120000),
                     labels = label_number(scale_cut = cut_short_scale())) +
  ggtitle("Plot 1b", subtitle = "old x (# of tweets), \nnew y (# distinct accounts/max distinct accounts seen)") -> plot1b
df2 %>%
  ggplot(aes(x=new_x, y=old_y, col=user_id)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  xlab("# of Tweets Collected / Avg # of Tweets per sec") +
  ylab("Fraction of Distinct Accounts (%)") +
  scale_x_continuous(n.breaks = 10, limits = c(0, 30000000),
                     labels = label_number(scale_cut = cut_short_scale())) +
  ggtitle("Plot 1c", subtitle = "new x (rescaled # of tweets), \nold y (# distinct accounts/max friends count)") -> plot1c
df2 %>%
  ggplot(aes(x=new_x, y=new_y, col=user_id)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  xlab("# of Tweets Collected / Avg # of Tweets per sec") +
  ylab("Fraction of Distinct Accounts (%)") +
  scale_x_continuous(n.breaks = 10, limits = c(0, 30000000),
                     labels = label_number(scale_cut = cut_short_scale())) +
  ggtitle("Plot 1d", subtitle = "new x (rescaled # of tweets), \nnew y (# distinct accounts/max distinct accounts seen)") -> plot1d
grid.arrange(plot1a, plot1b, plot1c, plot1d, nrow=2)
What happens when we set the x-axis (and y-axis) to common logarithmic scales?
# Plots 2a-2f: the same four panels on a common-log x axis, plus log-log
# variants (2e/2f). Each reuses the corresponding plot 1 object and layers a
# log10 x scale (and, for 2e/2f, a log10 y scale) on top.
plot1a +
  scale_x_log10(n.breaks=10, labels = scales::label_log()) +
  labs(x = "Log(Number of Tweets Collected)",
       title = "Plot 2a",
       subtitle = "Logged old x (# of tweets), \nold y (# distinct accounts/max friends count)") -> plot2a
plot1b +
  scale_x_log10(n.breaks=10, labels = scales::label_log()) +
  labs(x = "Log(Number of Tweets Collected)",
       title = "Plot 2b",
       subtitle = "Logged old x (# of tweets), \nnew y (# distinct accounts/max distinct accounts seen)") -> plot2b
plot1c +
  scale_x_log10(n.breaks=10, labels = scales::label_log()) +
  labs(x = "Log(# of Tweets Collected / Avg # of Tweets per sec)",
       title = "Plot 2c",
       subtitle = "Logged new x (rescaled # of tweets), \nold y (# distinct accounts/max friends count)") -> plot2c
plot1d +
  scale_x_log10(n.breaks=10, labels = scales::label_log()) +
  labs(x = "Log(# of Tweets Collected / Avg # of Tweets per sec)",
       title = "Plot 2d",
       subtitle = "Logged new x (rescaled # of tweets), \nnew y (# distinct accounts/max distinct accounts seen)") -> plot2d
# log-log versions of 2c/2d
plot2c +
  scale_y_log10(n.breaks=10, labels = scales::label_log()) +
  labs(title = "Plot 2e",
       subtitle = "Logged new x (rescaled # of tweets), \nLogged old y (# distinct accounts/max friends count)") -> plot2e
plot2d +
  scale_y_log10(n.breaks=10, labels = scales::label_log()) +
  labs(title = "Plot 2f",
       subtitle = "Logged new x (rescaled # of tweets), \nLogged new y (# distinct accounts/max distinct accounts seen)") -> plot2f
grid.arrange(plot2a, plot2b, plot2c, plot2d, plot2e, plot2f, ncol=2)
Distribution of the average number of tweets per second (`avg_n_tweets_persec`) across users:
# Histogram of each user's average collection rate (one value per user),
# drawn on a log10 x axis.
df2 %>%
  distinct(user_id, avg_n_tweets_persec) %>%
  ggplot(aes(x = avg_n_tweets_persec)) +
  geom_histogram(bins=10) +
  theme_few() +
  scale_x_log10(labels = scales::label_log()) +
  xlab("Log(avg # of tweets per sec)")
What if we aggregate users by taking the mean of fraction of distinct accounts (=y) at each point of the tweets collected (=x)?
For plots with the rescaled x-axis (=
# of tweets/avg # of tweets per sec), I applied binning on
the x-axis and then calculated weighted average of y.
This is due to the fact that avg # of tweets per sec
is highly distinct among users thus making it almost useless to group by
each point at the x-axis and summarizing the mean value of y.
Because the same bin may contain multiple observations of the same user, and some users may not appear at all in certain bins, I apply a weighted average in which each user is weighted by their number of observations in each bin.
# old_x, old_y — average old_y across users at each value of old_x
df2 %>%
  group_by(old_x) %>%
  summarize(y = mean(old_y)) %>%
  ungroup() %>%
  ggplot(aes(x=old_x, y=y)) +
  geom_point(alpha=0.5) +
  geom_smooth(color='darkcyan', linewidth=1) +
  theme_few() +
  theme(legend.position="none") +
  scale_x_continuous(n.breaks = 10, limits = c(0, 120000),
                     labels = label_number(scale_cut = cut_short_scale())) +
  ylim(c(0, 1)) +
  labs(x = "# of Tweets Collected",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 3a",
       subtitle = "old x (# of tweets),\nold y (# distinct accounts/max friends count)") -> plot3a
# old_x, new_y — same aggregation with the rescaled y
df2 %>%
  group_by(old_x) %>%
  summarize(y = mean(new_y)) %>%
  ungroup() %>%
  ggplot(aes(x=old_x, y=y)) +
  geom_point(alpha=0.5) +
  geom_smooth(color='darkcyan', linewidth=1) +
  theme_few() +
  theme(legend.position="none") +
  scale_x_continuous(n.breaks = 10, limits = c(0, 120000),
                     labels = label_number(scale_cut = cut_short_scale())) +
  ylim(c(0, 1)) +
  labs(x = "# of Tweets Collected",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 3b",
       subtitle = "old x (# of tweets), \nnew y (# distinct accounts/max distinct accounts seen)") -> plot3b
# make a new dataframe with binned x-axis and weighted mean
# FIX: the stated method weights each user by their number of observations in
# the bin. The original called weighted.mean() over the raw rows with
# weights = n(): each of a user's n rows carried weight n, giving that user an
# effective weight of n^2. Collapse to one mean per user per bin first, then
# take the weighted mean with weight n.
df2 %>%
  mutate(
    bins = cut(new_x ,
               breaks = pretty(new_x, n = (max(new_x)-min(new_x))/100000), # 1057 levels
               include.lowest = TRUE)) %>%
  group_by(user_id, bins) %>%
  summarise(
    old_y_user = mean(old_y),   # this user's mean old_y in the bin
    new_y_user = mean(new_y),   # this user's mean new_y in the bin
    weights = n(),              # number of observations the user has in the bin
    .groups = "drop"
  ) %>%
  group_by(bins) %>%
  summarise(old_y_weighted = weighted.mean(old_y_user, weights),
            new_y_weighted = weighted.mean(new_y_user, weights)) %>%
  ungroup() -> df3
# new_x, old_y — weighted bin means on the rescaled x axis
df3 %>%
  mutate(bins_x = as.integer(bins)) %>%
  ggplot(aes(x=bins_x, y=old_y_weighted)) +
  geom_point(alpha=0.5) +
  theme_few() +
  scale_x_continuous(n.breaks = 10, limits=c(0, 1000),
                     labels = label_number()) +
  ylim(c(0, 1)) +
  labs(x = "Bins of [# of Tweets Collected / Avg # of Tweets per sec]",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 3c",
       subtitle = "Binned new x (rescaled # of tweets), \nold y (# distinct accounts/max friends count)") -> plot3c
# new_x, new_y
df3 %>%
  mutate(bins_x = as.integer(bins)) %>%
  ggplot(aes(x=bins_x, y=new_y_weighted)) +
  geom_point(alpha=0.5) +
  theme_few() +
  scale_x_continuous(n.breaks = 10, limits=c(0, 1000),
                     labels = label_number()) +
  ylim(c(0, 1)) +
  labs(x = "Bins of [# of Tweets Collected / Avg # of Tweets per sec]",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 3d",
       subtitle = "Binned new x (rescaled # of tweets), \nnew y (# distinct accounts/max distinct accounts seen)") -> plot3d
# Zoomed-in versions of plot3c and plot3d (first 160 bins); the dashed lines
# mark reference bins 13, 74 and 153 discussed in the text.
df3 %>%
  mutate(bins_x = as.integer(bins)) %>%
  filter(bins_x < 160) %>%
  ggplot(aes(x=bins_x, y=old_y_weighted)) +
  geom_point(alpha=0.5) +
  theme_few() +
  scale_x_continuous(n.breaks = 10, limits=c(0, 160),
                     labels = label_number()) +
  ylim(c(0, 1)) +
  labs(x = "Bins of [# of Tweets Collected / Avg # of Tweets per sec]",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 3c | Zoomed In",
       subtitle = "Binned new x (rescaled # of tweets), \nold y (# distinct accounts/max friends count)") +
  geom_vline(xintercept=13, lty=2, color="darkcyan") +
  geom_vline(xintercept=153, lty=2, color="darkcyan") +
  geom_vline(xintercept=74, lty=2, color="darkcyan") -> plot3c_zoom
df3 %>%
  mutate(bins_x = as.integer(bins)) %>%
  filter(bins_x < 160) %>%
  ggplot(aes(x=bins_x, y=new_y_weighted)) +
  geom_point(alpha=0.5) +
  theme_few() +
  scale_x_continuous(n.breaks = 10, limits=c(0, 160),
                     labels = label_number()) +
  ylim(c(0, 1)) +
  labs(x = "Bins of [# of Tweets Collected / Avg # of Tweets per sec]",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 3d | Zoomed In",
       subtitle = "Binned new x (rescaled # of tweets), \nnew y (# distinct accounts/max distinct accounts seen)") +
  geom_vline(xintercept=13, lty=2, color="darkcyan") +
  geom_vline(xintercept=153, lty=2, color="darkcyan") +
  geom_vline(xintercept=74, lty=2, color="darkcyan") -> plot3d_zoom
grid.arrange(plot3a, plot3b, plot3c, plot3d, plot3c_zoom, plot3d_zoom, ncol=2)
Or this version of aggregate plots?
# Overlay version: both y definitions on one panel (colour distinguishes them)
df2 %>%
  group_by(old_x) %>%
  summarize(y_old = mean(old_y), y_new = mean(new_y)) %>%
  ungroup() %>%
  pivot_longer(cols = c("y_old","y_new")) %>%
  ggplot(aes(x=old_x, y=value, col=name)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  scale_x_continuous(n.breaks = 10, limits = c(0, 120000),
                     labels = label_number(scale_cut = cut_short_scale())) +
  ylim(c(0, 1)) +
  labs(x = "# of Tweets Collected",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 3ab: old x (# of tweets)",
       subtitle = "Pink: mean of new y \nBlue: mean of old y") -> plot3ab
# Same overlay for the binned, rescaled x; jitter keeps overlapping bin points visible
df3 %>%
  mutate(bins_x = as.integer(bins)) %>%
  pivot_longer(cols=c("old_y_weighted", "new_y_weighted")) %>%
  ggplot(aes(x=bins_x, y=value, col=name)) +
  geom_jitter(alpha=0.7, width=0.5, height=0.005) +
  theme_few() +
  theme(legend.position="none") +
  scale_x_continuous(n.breaks = 10, limits=c(0, 1000),
                     labels = label_number()) +
  ylim(c(0, 1)) +
  labs(x = "Bins of [# of Tweets Collected / Avg # of Tweets per sec]",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 3cd: Binned new x (rescaled # of tweets)",
       subtitle = "Pink: weighted mean of new y \nBlue: weighted mean of old y") +
  geom_vline(xintercept=13, lty=2, color="darkcyan") +
  geom_vline(xintercept=153, lty=2, color="darkcyan") +
  geom_vline(xintercept=74, lty=2, color="darkcyan") -> plot3cd
grid.arrange(plot3ab, plot3cd)
# Table: how many distinct users fall into each linear bin of new_x.
# FIX: removed the dead group_by(user_id, bins) %>% mutate(weights = n())
# step from the original — `weights` was never used in this pipeline and did
# not affect the summarize() result.
df2 %>%
  mutate(
    bins = cut(new_x ,
               breaks = pretty(new_x, n = (max(new_x)-min(new_x))/100000), # 1057 levels
               include.lowest = TRUE)) %>%
  mutate(bin_number = as.integer(bins)) %>%
  rename(bin_range = bins) %>%
  group_by(bin_number, bin_range) %>%
  summarize(distinct_users = n_distinct(user_id)) -> table_bin
datatable(table_bin,
          caption = "Bin No. & Bin Range",
          filter="top")
Between bin no. 74 and bin no. 153 (the last two dashed lines), there are only a handful of users (= 5). Most users (= 56) fall within the first dashed line (bin no. 13).
We could conclude from this that …
We should zoom in on bins 0 to 13 and/or
Logarithmic binning would make more sense and provide a higher resolution for the left/starting part of the plots 3.
So, let’s first zoom in on the plot 3cd.
# Zoom of plot 3cd onto the first 13 bins, with lines connecting bin means.
# FIX: geom_point() has no `width`/`height` parameters (those belong to
# geom_jitter); ggplot2 silently ignored them, so they are dropped.
df3 %>%
  mutate(bins_x = as.integer(bins)) %>%
  filter(bins_x < 14) %>%
  pivot_longer(cols=c("old_y_weighted", "new_y_weighted")) %>%
  ggplot(aes(x=bins_x, y=value, col=name)) +
  geom_point(alpha=0.7) +
  geom_line() +
  theme_few() +
  theme(legend.position="none") +
  xlab("Bins of [# of Tweets Collected / Avg # of Tweets per sec]") +
  ylab("Mean Fraction of Distinct Accounts (%)") +
  scale_x_continuous(n.breaks=13, labels = label_number()) +
  ylim(c(0, 1)) +
  ggtitle("Plot 3cd | Zoomed In: Binned new x (rescaled # of tweets)", subtitle = "Pink: weighted mean of new y \nBlue: weighted mean of old y") +
  geom_vline(xintercept=13, lty=2, color="darkcyan")
Rather than throwing cases with bin > 13 away, let’s do the logarithmic binning on the x-axis.
# for the labels:
# Build display labels for the logarithmic bin edges 2^1 .. 2^30.
# label_math() returns a labeller function; lapply-ing it over the edge values
# yields expression objects, flattened into a character-ish vector below.
unlist(lapply(c(2^(1:30)), label_math(2^.x, format = log))) -> labels_list
labels_vector = c()
# Keep only the text before the first "." of each deparsed label so the bin
# labels stay compact (drops decimal tails).
# NOTE(review): this grows labels_vector inside a 1:length() loop; left as-is
# because the expression handling is fragile.
for (i in 1:length(labels_list)) {
labels_vector[i] = unlist(str_split(as.character(labels_list[i]), "\\."))[1] }
# Logarithmic binning of new_x (powers of two), with weighted means per bin.
# FIX: same weighting bug as the linear binning — weighted.mean() over raw
# rows with weights = n() gives each user weight n^2. Collapse to one mean per
# user per bin first, then weight by the user's observation count n.
df2 %>%
  mutate(
    bins = cut(new_x ,
               breaks = c(1,2^(1:30)),
               labels = c(paste0('<=',labels_vector)),
               include.lowest = TRUE)) %>%
  group_by(user_id, bins) %>%
  summarise(
    old_y_user = mean(old_y),
    new_y_user = mean(new_y),
    weights = n(),
    .groups = "drop"
  ) %>%
  group_by(bins) %>%
  summarise(old_y_weighted = weighted.mean(old_y_user, weights),
            new_y_weighted = weighted.mean(new_y_user, weights)) %>%
  ungroup() -> df3
Let’s check how many users are in each logarithmic bin.
# Table: how many distinct users fall into each logarithmic bin.
# FIX: removed the dead group_by(user_id, bins) %>% mutate(weights = n())
# step — `weights` was unused and did not affect the result.
df2 %>%
  mutate(
    bins = cut(new_x ,
               breaks = c(1,2^(1:30)),
               labels = c(paste0('<=',labels_vector)),
               include.lowest = TRUE)) %>%
  mutate(bin_number = as.integer(bins)) %>%
  rename(bin_range = bins) %>%
  group_by(bin_number, bin_range) %>%
  summarize(distinct_users = n_distinct(user_id)) -> table_bin
datatable(table_bin,
          caption = "Bin No. & Bin Range",
          filter="top")
# Bar chart: number of distinct users per logarithmic bin
table_bin %>%
  ggplot(aes(x=bin_range, y=distinct_users)) +
  geom_col(fill='#ed4956') +
  theme_few() +
  theme(axis.text.x=element_text(size=8)) +
  scale_y_continuous(breaks = breaks_width(10)) +
  scale_x_discrete(guide = guide_axis()) +
  labs(title='Distribution of Logarithmic Bins',
       x="Bin Ranges",
       y="Number of Distinct Users")
Let’s replicate plots 3 (aggregate plots) with the new rescaled x-axis, which is logarithmic binned.
# Replicate plot 3cd with the logarithmic bins; shape + colour distinguish
# the two y definitions.
df3 %>%
  pivot_longer(cols=c("old_y_weighted", "new_y_weighted")) %>%
  ggplot(aes(x=bins, y=value, col=name)) +
  geom_point(size = 3, aes(shape = name)) +
  theme_few() +
  theme(legend.position="none") +
  ylim(c(0, 1)) +
  scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
  labs(x = "Logarithmic Bins of [# of Tweets Collected / Avg # of Tweets per sec]",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 3cd | Replicated: Logarithmic Binned new x (rescaled # of tweets)",
       subtitle = "Pink: weighted mean of new y \nBlue: weighted mean of old y") -> plot3cd_log
grid.arrange(plot3ab, plot3cd_log)
What happens when we set the x-axis and/or y-axis to common logarithmic scales and replicate the aggregate plots?
# Plots 4A-4D: aggregate plots on common-log axes.
# 4A/4B reuse plot3ab; 4C/4D rebuild the log-binned overlay.
plot3ab +
  scale_x_log10(n.breaks=10, labels = scales::label_log()) +
  labs(x = "Log(# of Tweets Collected)",
       title = "Plot 4A: Logged old x (# of tweets)",
       subtitle = "Pink: mean of new y \nBlue: mean of old y") -> plot4a
plot4a +
  scale_y_log10(n.breaks=10, labels = scales::label_log()) +
  labs(y = "Log(Mean Fraction of Distinct Accounts (%))",
       title = "Plot 4B: Logged old x (# of tweets)",
       subtitle = "Pink: Logged weighted mean of new y \nBlue: Logged weighted mean of old y") -> plot4b
df3 %>%
  pivot_longer(cols=c("old_y_weighted", "new_y_weighted")) %>%
  ggplot(aes(x=bins, y=value, col=name)) +
  geom_point(size = 2, aes(shape = name)) +
  theme_few() +
  theme(legend.position="none") +
  ylim(c(0, 1)) +
  scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
  labs(x = "Logarithmic Bins of [# of Tweets Collected / Avg # of Tweets per sec])",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 4C: Logarithmic Binned new x (rescaled # of tweets)",
       subtitle = "Pink: weighted mean of new y \nBlue: weighted mean of old y") -> plot4c
plot4c +
  scale_y_log10(n.breaks=10, labels = scales::label_log()) +
  labs(y = "Log(Mean Fraction of Distinct Accounts (%))",
       title = "Plot 4D: Logarithmic Binned new x (rescaled # of tweets)",
       subtitle = "Pink: Logged weighted mean of new y \nBlue: Logged weighted mean of old y") -> plot4d
grid.arrange(plot4a, plot4b, plot4c, plot4d)
In the data frame, there are 60 unique users. Let’s redraw some of the plots by each individual user. I allowed scales of the x-axis to vary for each user.
# Plots 5a-5d: one small panel per user (facets), x scale free per user.
df2 %>%
  ggplot(aes(x=old_x, y=old_y, col=user_id)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  scale_x_continuous(n.breaks = 5,
                     labels = label_number(scale_cut = cut_short_scale())) +
  facet_wrap(~user_id, scales="free_x", ncol = 10) +
  labs(x = "# of Tweets Collected",
       y = "Fraction of Distinct Accounts Appearing in Tweets (%)",
       title = "Plot 5a",
       subtitle = "old x (# of tweets), \nold y (# distinct accounts/max friends count)")
df2 %>%
  ggplot(aes(x=old_x, y=new_y, col=user_id)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  scale_x_continuous(n.breaks = 5,
                     labels = label_number(scale_cut = cut_short_scale())) +
  facet_wrap(~user_id, scales="free_x", ncol = 10) +
  labs(x = "# of Tweets Collected",
       y = "Fraction of Distinct Accounts Appearing in Tweets (%)",
       title = "Plot 5b",
       subtitle = "old x (# of tweets), \nnew y (# distinct accounts/max distinct accounts seen)")
df2 %>%
  ggplot(aes(x=new_x, y=old_y, col=user_id)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  scale_x_continuous(n.breaks = 5,
                     labels = label_number(scale_cut = cut_short_scale())) +
  facet_wrap(~user_id, scales="free_x", ncol = 10) +
  labs(x = "# of Tweets Collected / Avg # of Tweets per sec",
       y = "Fraction of Distinct Accounts Appearing in Tweets (%)",
       title = "Plot 5c",
       subtitle = "new x (rescaled # of tweets), \nold y (# distinct accounts/max friends count)")
df2 %>%
  ggplot(aes(x=new_x, y=new_y, col=user_id)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  scale_x_continuous(n.breaks = 5,
                     labels = label_number(scale_cut = cut_short_scale())) +
  facet_wrap(~user_id, scales="free_x", ncol = 10) +
  labs(x = "# of Tweets Collected / Avg # of Tweets per sec",
       y = "Fraction of Distinct Accounts Appearing in Tweets (%)",
       title = "Plot 5d",
       subtitle = "new x (rescaled # of tweets), \nnew y (# distinct accounts/max distinct accounts seen)")
It seems some people follow very few accounts while some others follow very many accounts. Let’s check distribution of the friends counts as well as maximum number of accounts observed in collected tweets.
# One row per user: their friends count and the max distinct accounts actually
# seen in collected tweets.
# FIX: arrange(-desc(x)) is a double negation of plain ascending order;
# replaced with arrange(x).
df2 %>%
  group_by(user_id) %>%
  mutate(max_accounts_seen = max(numerator)) %>%
  distinct(user_id, max_friends_count, max_accounts_seen) %>%
  arrange(max_friends_count) -> table_dta
datatable(table_dta, filter="top")
Let’s remove users whose max_accounts_seen is less than 10 & more than 1,000 - and re-draw aggregate plots.
# Aggregate plot excluding outlier users (max_accounts_seen outside [10, 1000])
df2 %>%
  group_by(user_id) %>%
  mutate(max_accounts_seen = max(numerator)) %>%
  filter(max_accounts_seen >= 10 & max_accounts_seen <= 1000) %>%
  ungroup() %>%
  group_by(old_x) %>%
  summarize(y_old = mean(old_y), y_new = mean(new_y)) %>%
  ungroup() %>%
  pivot_longer(cols = c("y_old","y_new")) %>%
  ggplot(aes(x=old_x, y=value, col=name)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  scale_x_continuous(n.breaks = 10, limits = c(0, 45000),
                     labels = label_number(scale_cut = cut_short_scale())) +
  labs(x = "# of Tweets Collected",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 6ab (w/o outliers): old x (# of tweets)",
       subtitle = "Pink: mean of new y \nBlue: mean of old y") -> plot6ab
# Log-binned weighted means, excluding outlier users.
# FIX: same weighting bug as df3 — per-user bin means are computed first so
# each user is weighted by n (their observations in the bin), not n^2.
# Filtering the outlier users before binning is equivalent to the original
# order, since removing a user does not change any other user's weights.
df2 %>%
  group_by(user_id) %>%
  mutate(max_accounts_seen = max(numerator)) %>%
  ungroup() %>%
  filter(max_accounts_seen >= 10 & max_accounts_seen <= 1000) %>%
  mutate(
    bins = cut(new_x ,
               breaks = c(1,2^(1:30)),
               labels = c(paste0('<=',labels_vector)),
               include.lowest = TRUE)) %>%
  group_by(user_id, bins) %>%
  summarise(
    old_y_user = mean(old_y),
    new_y_user = mean(new_y),
    weights = n(),
    .groups = "drop"
  ) %>%
  group_by(bins) %>%
  summarise(old_y_weighted = weighted.mean(old_y_user, weights),
            new_y_weighted = weighted.mean(new_y_user, weights)) %>%
  ungroup() -> df4
# Log-binned overlay without the outlier users; dashed line marks bin 13
df4 %>%
  pivot_longer(cols=c("old_y_weighted", "new_y_weighted")) %>%
  ggplot(aes(x=bins, y=value, col=name)) +
  geom_point(size = 3, aes(shape = name)) +
  theme_few() +
  theme(legend.position="none") +
  scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
  labs(x = "Logarithmic Bins of [# of Tweets Collected / Avg # of Tweets per sec]",
       y = "Mean Fraction of Distinct Accounts (%)",
       title = "Plot 6cd (w/o outliers): new x (rescaled # of tweets)",
       subtitle = "Pink: weighted mean of new y \nBlue: weighted mean of old y") +
  geom_vline(xintercept=13, lty=2, color="darkcyan") -> plot6cd
# bin no. 29: (2,800,000 , 2,900,000]
# bin no. 77: (7,300,000 , 7,400,000]
grid.arrange(plot6ab, plot6cd)
I merged the lists of low quality sources (FIBER + NewsGuard) with my data frame using the screen names.
#* Note:
#* This is a sample code for merging low quality twitter accounts from different data sources
#* Here, I combine low quality sources from (1) FIBER lists, (2) NewsGuard lists
# Load packages
library(readr)
library(tidyverse)
library(dplyr)
# (1) FIBER
# Load all the FIB twitter lists (one CSV per month, 2022-01 through 2023-06).
# NOTE(review): kept as 18 explicit assignments because the rbind() that
# follows refers to each object by name; a purrr::map_dfr() over the file
# names would be tidier if that call is updated together with this one.
X2022_01_fib_indices_twitter <- read_csv("list/2022_01__fib_indices_twitter.csv")
X2022_02_fib_indices_twitter <- read_csv("list/2022_02__fib_indices_twitter.csv")
X2022_03_fib_indices_twitter <- read_csv("list/2022_03__fib_indices_twitter.csv")
X2022_04_fib_indices_twitter <- read_csv("list/2022_04__fib_indices_twitter.csv")
X2022_05_fib_indices_twitter <- read_csv("list/2022_05__fib_indices_twitter.csv")
X2022_06_fib_indices_twitter <- read_csv("list/2022_06__fib_indices_twitter.csv")
X2022_07_fib_indices_twitter <- read_csv("list/2022_07__fib_indices_twitter.csv")
X2022_08_fib_indices_twitter <- read_csv("list/2022_08__fib_indices_twitter.csv")
X2022_09_fib_indices_twitter <- read_csv("list/2022_09__fib_indices_twitter.csv")
X2022_10_fib_indices_twitter <- read_csv("list/2022_10__fib_indices_twitter.csv")
X2022_11_fib_indices_twitter <- read_csv("list/2022_11__fib_indices_twitter.csv")
X2022_12_fib_indices_twitter <- read_csv("list/2022_12__fib_indices_twitter.csv")
X2023_01_fib_indices_twitter <- read_csv("list/2023_01__fib_indices_twitter.csv")
X2023_02_fib_indices_twitter <- read_csv("list/2023_02__fib_indices_twitter.csv")
X2023_03_fib_indices_twitter <- read_csv("list/2023_03__fib_indices_twitter.csv")
X2023_04_fib_indices_twitter <- read_csv("list/2023_04__fib_indices_twitter.csv")
X2023_05_fib_indices_twitter <- read_csv("list/2023_05__fib_indices_twitter.csv")
X2023_06_fib_indices_twitter <- read_csv("list/2023_06__fib_indices_twitter.csv")
# Merge the monthly FIB lists into one table and de-duplicate the accounts
bind_rows(X2022_01_fib_indices_twitter, X2022_02_fib_indices_twitter, X2022_03_fib_indices_twitter,
          X2022_04_fib_indices_twitter, X2022_05_fib_indices_twitter, X2022_06_fib_indices_twitter,
          X2022_07_fib_indices_twitter, X2022_08_fib_indices_twitter, X2022_09_fib_indices_twitter,
          X2022_10_fib_indices_twitter, X2022_11_fib_indices_twitter, X2022_12_fib_indices_twitter,
          X2023_01_fib_indices_twitter, X2023_02_fib_indices_twitter, X2023_03_fib_indices_twitter,
          X2023_04_fib_indices_twitter, X2023_05_fib_indices_twitter, X2023_06_fib_indices_twitter) %>%
  dplyr::select(user_id, username) %>%
  mutate(user_id = as.factor(user_id)) %>%
  distinct() -> unique_list
# Check the list: 155 unique users identified
# glimpse(unique_list)
# Make a new dataframe: fiber (screen names only, renamed for the later join)
unique_list %>%
  transmute(screen_name = username) -> fiber
# (2) NewsGuard
# Load recent metadata csv files (ten snapshots from 2023-06-30).
# NOTE(review): kept as explicit assignments because the rbind() below refers
# to each object by name.
metadata1 <- read_csv("metadata-2023063000.csv")
metadata2 <- read_csv("metadata-2023063001.csv")
metadata3 <- read_csv("metadata-2023063004.csv")
metadata4 <- read_csv("metadata-2023063006.csv")
metadata5 <- read_csv("metadata-2023063009.csv")
metadata6 <- read_csv("metadata-2023063011.csv")
metadata7 <- read_csv("metadata-2023063014.csv")
metadata8 <- read_csv("metadata-2023063017.csv")
metadata9 <- read_csv("metadata-2023063020.csv")
metadata10 <- read_csv("metadata-2023063023.csv")
# Keep only the Twitter accounts NewsGuard rates "N" (low score);
# rows missing either field are dropped (equivalent to the former na.omit()).
bind_rows(metadata1, metadata2, metadata3, metadata4, metadata5,
          metadata6, metadata7, metadata8, metadata9, metadata10) %>%
  dplyr::select(Rating, Twitter) %>%
  filter(!is.na(Rating), !is.na(Twitter), Rating == "N") -> metadata
# glimpse(metadata)
# Extract the Twitter handle from each profile URL.
# A link looks like "https://twitter.com/<handle>[?query][,junk]": the handle
# is the 4th "/"-separated component; anything after a "?" (query string) or a
# "," (stray separator) is stripped, exactly as the original three loops did.
# Vectorized with vapply()/sub() instead of growing vectors in loops.
list_handle <- vapply(strsplit(unique(metadata$Twitter), "/", fixed = TRUE),
                      function(parts) parts[4], character(1))
list_handle <- sub("\\?.*$", "", list_handle)  # drop query string
list_handle <- sub(",.*$", "", list_handle)    # drop anything after a comma
# Check
# list_handle
# Make a new dataframe: newsguard
newsguard = data.frame(screen_name = list_handle)
# Combine the two source lists, de-duplicate, and flag every row as low quality
low_quality <- rbind(newsguard, fiber) %>%
  unique() %>%
  mutate(low_quality = "yes")
# nrow(low_quality) # 787
# df : exported dataframe from Python -> see 'pilot_re.Rmd' file
# Attach the low-quality flag to the participant data by screen name;
# rows that match get low_quality == "yes", all others get NA
backup %>%
  dplyr::select(user_id, account_id, screen_name) %>%
  left_join(low_quality, by="screen_name") -> df_lowq
# nrow(df_lowq) # 756404
#* When replicating the 'pilot_re.Rmd' file, use 'df_lowq' dataframe and change the y-axis:
#* distinct # of low quality sources appearing in Tweets
# Counting....
table(is.na(df_lowq$low_quality))
##
## FALSE TRUE
## 7322 749082
# FALSE 7322, TRUE 749082
# So, there are total 7322 cases (tweets) with low-quality sources
In the data frame, there are 7,322 rows where the account is low quality source among the total 756,404 rows.
I counted how many distinct number of low quality accounts are in the merged dataset, and the result is only 28.
# But how many distinct # of accounts?
# Distinct low-quality accounts appearing anywhere in the pilot data:
# keep flagged rows, drop user_id, and de-duplicate the remaining
# (account_id, screen_name, low_quality) rows.
df_lowq %>%
group_by(account_id) %>%
filter(!is.na(low_quality)) %>%
dplyr::select(-user_id) %>%
unique() -> table_lowq
# only 28
datatable(table_lowq,
caption = "How many distinct # of low quality accounts in the pilot data?",
filter="top")
# Then, how many users? 21
# For each user: the number of distinct low-quality accounts seen in their
# home timeline (select keeps user_id because it is the grouping variable),
# sorted from most to fewest.
df_lowq %>%
  group_by(user_id) %>%
  filter(!is.na(low_quality)) %>%
  dplyr::select(account_id) %>%
  unique() %>%
  count() %>%
  arrange(desc(n)) -> table_user_lowq
# FIX: caption typo "at least on low quality accounts" -> "at least one low
# quality account"
datatable(table_user_lowq,
          caption = "How many unique users are seeing at least one low quality account in their home timeline?",
          filter="top")